
eba_fit_dw <- function(dep.vars = c("acc_vert", "v2x_corr"), 
                    dep.names = c("Vertical Accountability", "Corruption Index"), 
                    indirect.var = c("lag1_comb_indirect"), # note you need "lag1" here because the variables are lagged before analysis
                    direct.var = c("lag1_direct_ex_sc_ex_rt"), # note you need "lag1" here because the variables are lagged before analysis
                    # IMPORTANT: these 2 variables determine which coefficients are differenced
                    main.vars = c("rev_ex_gr_ex_sc", "comb_indirect",
                                  "direct_ex_sc_ex_rt", "nontax"), 
                    add.vars = dfVar$var_name[dfVar$var_group == "additional"], 
                    key.vars = dfVar$var_name[dfVar$var_group == "key"],
                    factor.vars = c("region"), # Any variables you want as factors include here
                    add.terms = c(), # c("region:year", "region:year_sq") # This is where you can add in the linear time trends etc
                    data = dfMerge, 
                    unit.id = c("ccode"), # unit identifier
                    time.id = c("year"), # time identifier (need this and above to grab correct number of unique IDs from fixest object)
                    se.type = c("cluster"), 
                    cluster.var = c("year"), 
                    clus.level = c("time"),
                    fe.vars = c("year", "ccode"), # which variables you want to use as FEs
                    samp.num = 500, 
                    min.select = 1, # This is the number of variables that will be selected from the larger list
                    max.select = 5, 
                    inc.mod = FALSE,  # Set this to true when you need to see the feols output
                    conduct.ar = TRUE){
                     
  if(!exists("dfVar")){stop("Variable data frame dfVar not present -- load before running function")}
  
  ## Load necessary packages
  
  require(pacman)
  pacman::p_load(dplyr)
  
  ## Run smaller version of EBA
  dfUse <- pdata.frame(data, index = c(unit.id, time.id))
  dfUse <- make.pbalanced(dfUse, balance.type = "fill") # Fill all
  depVars <- dep.vars
  depNames <- dep.names
  addVars <- add.vars
  keyVars <- key.vars
  mainVars <-  main.vars # specify these manually, you don't want them all, note currently no non-tax
  feVars <- fe.vars
  factorVars <- factor.vars
  indirectVar <-  indirect.var
  directVar <-  direct.var
  addTerms <- add.terms 
  clusVar <- cluster.var
  seType <- se.type
  sampNum <-  samp.num # for final, should be modNum
  levCluster <- clus.level
  incMod <- inc.mod
  conductAR <- conduct.ar

  
  ## Change any variables you need to factor (when using additional FE, for example)
  
  for(i in 1:length(factorVars)){dfUse[, factorVars[i]] <- as.factor(dfUse[, factorVars[i]])}
  
  for(i in 1:length(feVars)){dfUse[, feVars[i]] <- as.factor(dfUse[, feVars[i]])}
  
  ## Check total number of non-missing rows in base model + fully specified model
  
  nrow(na.omit(dfUse[, c(keyVars, mainVars, depVars, 
                         indirectVar, directVar)]))/nrow(dfUse)
  nrow(na.omit(dfUse[, c(keyVars, mainVars, depVars, 
                         indirectVar, directVar, addVars)]))/nrow(dfUse)
  
  dvList <- list()
  pb <- txtProgressBar(min = 0, max = length(depVars)*sampNum, style = 3)
  k <- 0 
  g <- 0
  
  ############### BEGIN LOOP OVER DVS
  for(j in 1:length(depVars)){
  
    ###### BASE MODEL: TWO-WAY FE 
    
    ## Set key parameters
    
    numSelect <- min.select:max.select # This is the number of variables you'll select from addVars
    (modNum <- sum(sapply(numSelect, FUN = function(x){choose(length(addVars), x)}))) # Total number of models doing 20 choose 10
    seType <- se.type  # can be "classic", "robust" or "cluster"
    minYears <- 2 # Minimum number of years that a country can be in the dataset after accounting for missingness
    depUse <- depVars[j]
    baseVars <- paste("lag1_", c(mainVars, keyVars), sep = "")
    if(depVars[j] == "polity2"){baseVars <- baseVars[!baseVars %in% "lag1_regime"]} # Don't want to include regime as a covariate when the DV is polity score
    eqBase <- make_equation(depUse, c(baseVars, addTerms)) 
    
    ## Build list of all possible RHS variables
    
    formCall <- c()
    for(i in min.select:max.select){
      combs <- combn(paste("lag1_", addVars, sep = ""), i)
      formCall <- c(formCall, apply(combs, 2, FUN = function(x){make_equation(depUse, c(baseVars, addTerms, x))}))
    }
    
    ## Randomly pick a subset of the potential models
    
    formUse <- c(eqBase, formCall[sample(length(formCall), size = sampNum, replace = F)]) # This is faster than sampling directly from formCall and will not create memory problems
          # Note that we add the base model here so that you can estimate the effects of adding each of the aux vars
    formUseplm <- c(eqBase, formCall[sample(length(formCall), size = sampNum, replace = F)])
    ## Estimate model
    dfEst <- dfMod <- data.frame()
    dfEstplm <- dfModplm <- data.frame()
    modObj <- list()
    modObjplm <- list()
    #print(paste0("Beginning estimation for", depNames[j], "!"))
    for(s in 1){
    for(i in 1:length(formUseplm)){
      
      ## Generate objects necessary for analysis and model diagnostics later
      #useEq <- as.formula(formUse[i])
      useEqplm <- as.formula(formUseplm[i])
      #useVars <- all.vars(useEq)
      useVarsplm <- all.vars(useEqplm)
      dfLoop <- dfUse
      dfLoop$year <- as.numeric(dfLoop$year)
      dfLoop$year_sq <- as.numeric(dfLoop$year_sq)
      panelInfo <- tapply(dfLoop$year, dfLoop$ccode, length)
      rmObs <- which(dfLoop$ccode %in% as.numeric(names(panelInfo[panelInfo < minYears]))) # Remove any countries that only show up once, can't take a mean there
      if(length(rmObs > 0)){dfLoop <- dfLoop[-rmObs, ]} # Remove any rows that don't fit criteria above
      
      ## Modified Durbin-Watson Test
      #modEqplm <- paste(formUseplm[i], " |", " + ", collapse = "")
      modplm <- plm(as.formula(formUseplm[i]), model = c("within"), effect = c("twoways"), data = dfLoop, 
                    index = c("ccode", "year"))
      if(incMod){modObjplm[[i]] <- modplm}
      if(conductAR){arStat <- pbnftest(modplm)$statistic}else{arStat <- NA} # Note that because panel is unbalanced test is modified Durbin-Watson test, recommendation is statistic < 2 suggests no positive serial correlation
      
      vcovUse <- vcovHC(modplm, cluster = "time")
      estMatplm <- as.data.frame(t(coeftest(modplm, vcov. = vcovUse)[, 1:4]))
      #estMatplm <- as.data.frame(t(summary(modplm, se = seType)$coeftable[, 1:4]))
      rownames(estMatplm) <- c("beta", "se", "t", "p")
      
      ## Estimate model (feols)
      #modEq <- paste(formUse[i], " |", paste(feVars, collapse = " + "), collapse = "")
      #mod <- feols(as.formula(modEq), data = dfLoop, notes = FALSE) # notes = F removes
      #if(incMod){modObj[[i]] <- mod}
      
      #if(seType != "cluster"){
      #  estMat <- as.data.frame(t(summary(mod, se = seType)$coeftable[, 1:4]))  
      #}
      #if(seType == "cluster"){
      #  estMat <- as.data.frame(t(summary(mod, cluster = dfLoop[, clusVar])$coeftable[, 1:4]))
      #}
      ## Format output and create vcov for coefficient test
      
      
      
      #rownames(estMat) <- c("beta", "se", "t", "p")
      
      ## Compare coefficients on direct and indirect taxation
      
      #coefs <- summary(mod)$coeftable
      #betaDiff <- coefs[rownames(coefs) == directVar, 1] - coefs[rownames(coefs) == indirectVar, 1]
      #locDirect <- which(rownames(mod$cov.unscaled) == directVar)
      #locIndirect <- which(rownames(mod$cov.unscaled) == indirectVar)
      #seDiff <- sqrt(mod$cov.unscaled[locDirect, locDirect] 
      #               + mod$cov.unscaled[locIndirect, locIndirect]
      #               + 2*mod$cov.unscaled[locDirect, locIndirect])
      #dfEst <- rbind.fill(dfEst, estMat)
      #newStats <- c(betaDiff, # difference between indirect and direct taxation coefficients
      #              seDiff, # se of difference
      #              betaDiff/seDiff, # t of difference
      #              2*(1 - pnorm(abs(betaDiff/seDiff))), # p of differece (note this is two-sided)
      #              1 - sum(mod$residuals^2)/mod$ssr_null, # r-squared
      #              #arStat, # modified Durbin-Watson test
      #              mod$nobs,
      #              mod$nobs/nrow(dfLoop), # This is the % of non-missing data
      #              mod$fixef_sizes[unit.id], # number of unique unit IDs in estimation data frame
      #              mod$fixef_sizes[time.id]) # number of unique time IDs in estimation data frame
      #dfMod <- rbind(dfMod, newStats)
      #  
      coefsplm <- summary(modplm)$coefficients
      betaDiffplm <- coefsplm[rownames(coefsplm) == directVar, 1] - coefsplm[rownames(coefsplm) == indirectVar, 1]
      locDirectplm <- which(rownames(vcovUse) == directVar)
      locIndirectplm <- which(rownames(vcovUse) == indirectVar)
      seDiffplm <- sqrt(vcovUse[locDirectplm, locDirectplm] 
                        + vcovUse[locIndirectplm, locIndirectplm]
                        - 2*vcovUse[locDirectplm, locIndirectplm])
      dfEstplm <- rbind.fill(dfEstplm, estMatplm)
      newStatsplm <- c(betaDiffplm, # difference between indirect and direct taxation coefficients
                       seDiffplm, # se of difference
                       betaDiffplm/seDiffplm, # t of difference
                       2*(1 - pnorm(abs(betaDiffplm/seDiffplm))), # p of differece (note this is two-sided)
                       summary(modplm)$r.squared, # r-squared
                       arStat, # modified Durbin-Watson test
                       nrow(model.frame(modplm)),
                       nrow(model.frame(modplm))/nrow(dfLoop), # This is the % of non-missing data
                       length(unique(dfLoop$ccode)), # number of unique unit IDs in estimation data frame
                       length(unique(dfLoop$year))) # number of unique time IDs in estimation data frame
      dfModplm <- rbind(dfModplm, newStatsplm)
      
      g <- g + 1
      setTxtProgressBar(pb, g)
    }       
    
    for(i in 1:length(formUse)){
      
      ## Generate objects necessary for analysis and model diagnostics later
      useEq <- as.formula(formUse[i])
      #useEqplm <- as.formula(formUseplm[i])
      useVars <- all.vars(useEq)
      #useVarsplm <- all.vars(useEqplm)
      dfLoop <- dfUse
      dfLoop$year <- as.numeric(dfLoop$year)
      dfLoop$year_sq <- as.numeric(dfLoop$year_sq)
      panelInfo <- tapply(dfLoop$year, dfLoop$ccode, length)
      rmObs <- which(dfLoop$ccode %in% as.numeric(names(panelInfo[panelInfo < minYears]))) # Remove any countries that only show up once, can't take a mean there
      if(length(rmObs > 0)){dfLoop <- dfLoop[-rmObs, ]} # Remove any rows that don't fit criteria above
      
      ## Modified Durbin-Watson Test
      #modEqplm <- paste(formUseplm[i], " |", paste(feVars, collapse = " + "), collapse = "")
      #modplm <- plm(as.formula(modEqplm), model = c("within"), effect = c("twoways"), data = dfLoop, 
      #           index = c("ccode", "year"))
      #if(incMod){modObjplm[[i]] <- modplm}
      #if(conductAR){arStat <- pbnftest(modplm)$statistic}else{arStat <- NA} # Note that because panel is unbalanced test is modified Durbin-Watson test, recommendation is statistic < 2 suggests no positive serial correlation
      
      #vcovUse <- vcovHC(modplm, cluster = "time")
      #estMatplm <- as.data.frame(t(coeftest(modplm, vcov. = vcovUse)[, 1:4]))
      #estMatplm <- as.data.frame(t(summary(modplm, se = seType)$coeftable[, 1:4]))
      #rownames(estMatplm) <- c("beta", "se", "t", "p")

      ## Estimate model (feols)
      modEq <- paste(formUse[i], " |", paste(feVars, collapse = " + "), collapse = "")
      mod <- feols(as.formula(modEq), data = dfLoop, notes = FALSE) # notes = F removes
      if(incMod){modObj[[i]] <- mod}
      
      if(seType != "cluster"){
        estMat <- as.data.frame(t(summary(mod, se = seType)$coeftable[, 1:4]))  
      }
      if(seType == "cluster"){
        estMat <- as.data.frame(t(summary(mod, cluster = dfLoop[, clusVar])$coeftable[, 1:4]))
      }
      ## Format output and create vcov for coefficient test
      
        
      
      rownames(estMat) <- c("beta", "se", "t", "p")
      
      ## Compare coefficients on direct and indirect taxation
      
      coefs <- summary(mod)$coeftable
      betaDiff <- coefs[rownames(coefs) == directVar, 1] - coefs[rownames(coefs) == indirectVar, 1]
      locDirect <- which(rownames(mod$cov.unscaled) == directVar)
      locIndirect <- which(rownames(mod$cov.unscaled) == indirectVar)
      seDiff <- sqrt(mod$cov.unscaled[locDirect, locDirect] 
                     + mod$cov.unscaled[locIndirect, locIndirect]
                     + 2*mod$cov.unscaled[locDirect, locIndirect])
      dfEst <- rbind.fill(dfEst, estMat)
      newStats <- c(betaDiff, # difference between indirect and direct taxation coefficients
                    seDiff, # se of difference
                    betaDiff/seDiff, # t of difference
                    2*(1 - pnorm(abs(betaDiff/seDiff))), # p of differece (note this is two-sided)
                    1 - sum(mod$residuals^2)/mod$ssr_null, # r-squared
                    #arStat, # modified Durbin-Watson test
                    mod$nobs,
                    mod$nobs/nrow(dfLoop), # This is the % of non-missing data
                    mod$fixef_sizes[unit.id], # number of unique unit IDs in estimation data frame
                    mod$fixef_sizes[time.id]) # number of unique time IDs in estimation data frame
      dfMod <- rbind(dfMod, newStats)
      
      #coefsplm <- summary(modplm)$coeftable
      #betaDiffplm <- coefsplm[rownames(coefsplm) == directVar, 1] - coefsplm[rownames(coefsplm) == indirectVar, 1]
      #locDirectplm <- which(rownames(modplm$cov.unscaled) == directVar)
      #locIndirectplm <- which(rownames(modplm$cov.unscaled) == indirectVar)
      #seDiffplm <- sqrt(modplm$cov.unscaled[locDirectplm, locDirectplm] 
      #               + modplm$cov.unscaled[locIndirectplm, locIndirectplm]
      #               + 2*modplm$cov.unscaled[locDirectplm, locIndirectplm])
      #dfEstplm <- rbind.fill(dfEstplm, estMatplm)
      #newStatsplm <- c(betaDiffplm, # difference between indirect and direct taxation coefficients
      #              seDiffplm, # se of difference
      #              betaDiffplm/seDiffplm, # t of difference
      #              2*(1 - pnorm(abs(betaDiffplm/seDiffplm))), # p of differece (note this is two-sided)
      #              1 - sum(modplm$residuals^2)/modplm$ssr_null, # r-squared
      #              arStat, # modified Durbin-Watson test
      #              modplm$nobs,
      #              modplm$nobs/nrow(dfLoop), # This is the % of non-missing data
      #              modplm$fixef_sizes[unit.id], # number of unique unit IDs in estimation data frame
      #              modplm$fixef_sizes[time.id]) # number of unique time IDs in estimation data frame
      #dfModplm <- rbind(dfModplm, newStatsplm)
      
      k <- k + 1
      setTxtProgressBar(pb, k)
    } 
}
 
    ## Format the data frame
    
    colnames(dfMod) <- c("diff_coef", "diff_se", "diff_t", "diff_p",
                         "rsq", "nobs", "prop_nonmiss", "num_ccodes",  # Note that num_ccodes and num_years are the number of unique IDs in the estimation data frame (i.e. after listwise deletion etc)
                         "num_years") # make sure these names match up with order in newStats
    dfMod[1:length(colnames(dfMod))] <- round(sapply(dfMod[1:length(colnames(dfMod))], as.numeric), digits = 4)
    
    colnames(dfModplm) <- c("diff_coef", "diff_se", "diff_t", "diff_p",
                         "rsq", "adj_rsq", "arStat", "nobs", "prop_nonmiss", "num_ccodes",  # Note that num_ccodes and num_years are the number of unique IDs in the estimation data frame (i.e. after listwise deletion etc)
                         "num_years") # make sure these names match up with order in newStats
    dfModplm[1:length(colnames(dfModplm))] <- round(sapply(dfModplm[1:length(colnames(dfModplm))], as.numeric), digits = 4)
    ## Unpack the data frame dfEst
    
    listEst <- list()
    qoi <- c("beta", "se", "t", "p")
    for(i in 1:length(qoi)){
      listEst[[qoi[i]]] <- dfEst[seq(from = i, to = nrow(dfEst), by = length(qoi)), ]
    }
    
    listEstplm <- list()
    qoiplm <- c("beta", "se", "t", "p")
    for(i in 1:length(qoiplm)){
      listEstplm[[qoiplm[i]]] <- estMatplm[seq(from = i, to = nrow(estMatplm), by = length(qoiplm)), ]
    }
    
    ## Average and median estimates 
    
    (dfCoefsAvg <- round(simplify2array(lapply(listEst, FUN = function(x){apply(x, 2, mean, na.rm = T)})), digits = 4))
    (dfCoefsAvgplm <- round(simplify2array(lapply(listEstplm, FUN = function(x){apply(x, 2, mean, na.rm = T)})), digits = 4))
    
    ## Some summary information about the coefficients from the models
    
    propNeg <- apply(listEst[[1]], 2, FUN = function(x){
      mean(ifelse(x < 0, 1, 0), na.rm = T)})
    propSig <- apply(listEst[[4]], 2, FUN = function(x){
      mean(ifelse(x < 0.1, 1, 0), na.rm = T)})
    posCoefs <- apply(listEst[[1]], 2, 
                      FUN = function(x){ifelse(x > 0, 1, -1)})
    sigCoefs <- apply(listEst[[4]], 2, 
                      FUN = function(x){ifelse(x < 0.1, 1, 0)})
    sigNeg <- apply(posCoefs*sigCoefs, 2, 
                    FUN = function(x){mean(ifelse(x == -1, 1, 0), na.rm = T)})
    sigPos <- apply(posCoefs*sigCoefs, 2, 
                    FUN = function(x){mean(ifelse(x == 1, 1, 0), na.rm = T)})
    
    dfSigAvg <- round(data.frame(prop_neg = propNeg, 
                                 prop_pos = 1 - propNeg, 
                                 prop_sigpos = sigPos,
                                 prop_signeg = sigNeg), digits = 4)
    
    
    ## Weighted average
    
    #weights <- dfMod$adj_rsq/sum(dfMod$adj_rsq, na.rm = T)
    weights <- c(dfMod$prop_nonmiss*dfMod$rsq/sum(dfMod$prop_nonmiss*dfMod$rsq))
    
    (dfCoefsWeighted <- round(simplify2array(lapply(listEst, FUN = function(x){
      apply(x, 2, FUN = function(x){weighted.mean(x, w = weights, na.rm = T)})})), digits = 4))
    
    
    ## Some summary information about the coefficients from the models
    
    propNeg <- apply(listEst[[1]], 2, FUN = function(x){
      weighted.mean(ifelse(x < 0, 1, 0), w = weights, na.rm = T)})
    propSig <- apply(listEst[[4]], 2, FUN = function(x){
      weighted.mean(ifelse(x < 0.1, 1, 0), w = weights, na.rm = T)})
    posCoefs <- apply(listEst[[1]], 2, 
                      FUN = function(x){ifelse(x > 0, 1, -1)})
    sigCoefs <- apply(listEst[[4]], 2, 
                      FUN = function(x){ifelse(x < 0.1, 1, 0)})
    sigNeg <- apply(posCoefs*sigCoefs, 2, 
                    FUN = function(x){weighted.mean(ifelse(x == -1, 1, 0), 
                                                    w = weights, na.rm = T)})
    sigPos <- apply(posCoefs*sigCoefs, 2, 
                    FUN = function(x){weighted.mean(ifelse(x == 1, 1, 0), 
                                                    w = weights, na.rm = T)})
    
    dfSigWeighted <- round(data.frame(prop_neg = propNeg, 
                                      prop_pos = 1 - propNeg, 
                                      prop_sigpos = sigPos,
                                      prop_signeg = sigNeg), digits = 4)
    
    
    ## Add this to higher level list
    
    dfList <- list(output = listEst, 
                   outputplm = listEstplm, 
                   model = dfMod, # This gives model summary statistics: rsq, adjrsq, artest, number of observations, proportion of non-missing observations, number of countries in sample, minimum number of time periods per country, maximum, average
                   modelplm = dfModplm, 
                   sig_avg = dfSigAvg,
                   sig_weighted = dfSigWeighted, 
                   coefs_avg = dfCoefsAvg,
                   coefs_weighted = dfCoefsWeighted,
                   mod_obj = modObj, 
                   mod_objplm = modObjplm, 
                   form_call = formUse)
    
    dvList[[j]] <- dfList
    
    
    ################## Visualization #############################
    
    ## Create a factor class vector denoting which coefficient (indirect/direct)
    
    nameVec <- factor(c(rep("Direct Taxes (% GDP)", nrow(dfList$output$beta)), 
                        rep("Indirect Taxes (% GDP)", nrow(dfList$output$beta))), 
                      levels = c("Direct Taxes (% GDP)", "Indirect Taxes (% GDP)"))
    colorVec <- factor(c(rep("darkgray", nrow(dfList$output$beta)), 
                         rep("lightgray", nrow(dfList$output$beta))), 
                       levels = c("Direct Taxes (% GDP)", "Indirect Taxes (% GDP)"))
    estVec <- c(dfList$output$beta$lag1_direct_ex_sc_ex_rt, 
                dfList$output$beta$lag1_comb_indirect)
    
    ## Show distributions in a single plot
    
    dfPlot <- data.frame(est = estVec, var_name = nameVec)
    p <- plot_multi_histogram(dfPlot, "est", "var_name") + 
      ylab("Density\n") + 
      xlab("\nEffect on DV") + 
      scale_fill_manual(values = c("gray30", "lightgray")) + 
      guides(fill = guide_legend("Tax Type")) +
      theme(legend.position = "bottom", 
            legend.justification = 0.5) + 
      ggtitle(depNames[j])
    dvList[[j]]$plot <- p
    
  }
  
  names(dvList) <- depVars
  return(dvList)
  
}
